library(limma)
library(Glimma)
library(edgeR)
library(dplyr)
library(magrittr)
setwd('~/Desktop/USC/Fall_2020/TRGN_510/510_FinalProject/adenocarcinoma_vs_squamous_cell_carcinoma')
# Reading in count data
count_matrix = read.table(file = 'count_processed.txt',sep = '\t',header = T,stringsAsFactors = F, check.names = F, row.names = 1)
count_matrix = count_matrix[,order(names(count_matrix))]
#Reading in clinical metadata
clinical_metadata = read.table(file = 'clinical_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
clinical_metadata = clinical_metadata[!duplicated(clinical_metadata[ , c("case_submitter_id")]),]
# Reading in exposure metadata
exposure_metadata = read.csv(file = 'exposure_processed.txt', sep = '\t', header = T, as.is = T, check.names = F)
# Merging metadata
all_metadata = merge(x = clinical_metadata, y = exposure_metadata, by = 'case_submitter_id')
all_metadata$ethnicity <- as.factor(all_metadata$ethnicity)
all_metadata$gender <- as.factor(all_metadata$gender)
all_metadata%<>%
mutate(diagnosis=case_when(
primary_diagnosis %in% c("Adenocarcinoma with mixed subtypes","Adenocarcinoma, NOS","Bronchiolo-alveolar adenocarcinoma, NOS","Papillary adenocarcinoma, NOS") ~ c("Adenocarcinoma"),
primary_diagnosis %in% c("Squamous cell carcinoma, large cell, nonkeratinizing, NOS","Squamous cell carcinoma, NOS") ~ c("Squamous_cell_carcinoma")
))
all_metadata$diagnosis = as.factor(all_metadata$diagnosis)
all_metadata$race <- as.factor(all_metadata$race)
rownames(all_metadata) <- all_metadata$case_submitter_id
all_metadata = all_metadata[,-1]
all_metadata = all_metadata[order(rownames(all_metadata)),]
# Creating DGEList Object
geneExpr = DGEList(counts = count_matrix, samples = all_metadata)
geneExpr$samples$group=all_metadata$diagnosis
geneid = rownames(geneExpr)
# Importing gencode reference from DGC website
gencode = read.table('gencode.gene.info.v22.tsv',sep = '\t',header = T,stringsAsFactors = F, check.names = F)
genes = gencode[geneid %in% gencode$gene_id, ]
genes = genes[,c("gene_id","gene_name","seqname")]
rownames(genes) = genes$gene_id
genes = genes[order(genes$gene_id),]
geneExpr$genes = genes[,-1]
cpm <- cpm(geneExpr)
lcpm <- cpm(geneExpr, log=TRUE)
L <- mean(geneExpr$samples$lib.size) * 1e-6
M <- median(geneExpr$samples$lib.size) * 1e-6
c(L,M)
[1] 52.92581 48.81745
The average library size for this dataset is about 61.1 million.
table(rowSums(geneExpr$counts==0)==75)
FALSE
60483
keep.exprs <- filterByExpr(geneExpr, group=geneExpr$samples$group)
Around 11% of genes in the dataset have zero counts across all 75 samples.
geneExpr <- geneExpr[keep.exprs,, keep.lib.sizes=FALSE]
dim(geneExpr)
[1] 21170 40
In this dataset, the median library size is 61.5 million and 10/61.5 is about 0.2, therefore the filterByExpr function keeps genes that have a CPM of 0.2 or more. With this cutoff the number of genes are reduced to 22200, about 36% of genes from what I started with.
lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(geneExpr)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.7), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
lcpm_filtered <- cpm(geneExpr, log=TRUE)
plot(density(lcpm_filtered[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm_filtered[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", colnames(geneExpr$counts), text.col=col, bty="n")
This graph showed that before filtering the data, a large portion of genes within each samples are lowly-expressed with log-CPM values that are small or negative.
geneExpr = calcNormFactors(geneExpr, method = "TMM")
geneExpr$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345
[11] 0.8890663 0.6747847 0.9553945 1.1161617 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621
[21] 0.8191521 1.0129474 0.9248736 1.1260210 1.0694870 0.7173413 1.0284701 1.0027376 1.0325592 1.1019957
[31] 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716 1.1264297 1.0857329 1.0489496 1.0082920
geneExpr_unnormalized <- geneExpr
geneExpr_unnormalized$samples$norm.factors <- 1
par(mfrow=c(1,2))
lcpm_unnormalized <- cpm(geneExpr_unnormalized, log=TRUE)
boxplot(lcpm_unnormalized, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
geneExpr_normalized <- calcNormFactors(geneExpr_unnormalized)
geneExpr_normalized$samples$norm.factors
[1] 1.0391573 0.9244982 0.8856329 1.0521559 1.0145085 1.1196887 1.0727778 1.3600421 1.0171641 0.9386345
[11] 0.8890663 0.6747847 0.9553945 1.1161617 0.9624960 1.1869816 1.0062698 0.8520753 0.8883963 1.0043621
[21] 0.8191521 1.0129474 0.9248736 1.1260210 1.0694870 0.7173413 1.0284701 1.0027376 1.0325592 1.1019957
[31] 1.0554783 0.9856847 1.0846584 1.0126931 1.1959792 0.9179716 1.1264297 1.0857329 1.0489496 1.0082920
lcpm_normalized <- cpm(geneExpr_normalized, log=TRUE)
boxplot(lcpm_normalized, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
This figure showed that after normalization the samples are more similar to each other.
lcpm <- cpm(geneExpr, log=TRUE)
par(mfrow=c(1,2))
col.group <- geneExpr$samples$group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
col.race <- geneExpr$samples$race
levels(col.race) <- brewer.pal(nlevels(col.race), "Set2")
col.race <- as.character(col.race)
col.gender <- geneExpr$samples$gender
levels(col.gender) <- brewer.pal(nlevels(col.gender), "Set3")
col.gender <- as.character(col.gender)
col.age <- geneExpr$samples$age_at_diagnosis
levels(col.age) <- brewer.pal(nlevels(col.age), "Set1")
col.age <- as.character(col.age)
col.ethnicity <- geneExpr$samples$ethnicity
levels(col.ethnicity) <- brewer.pal(nlevels(col.ethnicity), "Set2")
col.ethnicity <- as.character(col.ethnicity)
col.cigar_per_day <- geneExpr$samples$cigarettes_per_day
levels(col.cigar_per_day) <- brewer.pal(nlevels(col.cigar_per_day), "Set3")
col.cigar_per_day <- as.character(col.cigar_per_day)
plotMDS(lcpm,labels=geneExpr$samples$group, col=col.group)
title(main="A. Sample groups")
plotMDS(lcpm, labels=geneExpr$samples$gender, col=col.gender, dim=c(3,4))
title(main="B. Sex")
plotMDS(lcpm, labels=geneExpr$samples$age_at_diagnosis, col=col.age, dim=c(3,4))
title(main="C. Age at diagnosis")
plotMDS(lcpm, labels=geneExpr$samples$race, col=col.race, dim=c(3,4))
title(main="D. Race")
plotMDS(lcpm, labels=geneExpr$samples$ethnicity, col=col.ethnicity, dim=c(3,4))
title(main="E. Ethnicity")
plotMDS(lcpm, labels=geneExpr$samples$cigarettes_per_day, col=col.cigar_per_day, dim=c(3,4))
title(main="F. Cigarettes per day")
In the MDS plot, there is no separation between group of patients with early stage lung cancer and late stage lung cancer.
diagnosis = geneExpr$samples$group
ethnicity = geneExpr$samples$ethnicity
sex = geneExpr$samples$gender
race = geneExpr$samples$race
age_at_diagnosis = geneExpr$samples$age_at_diagnosis
smoking_hist = geneExpr$samples$cigarettes_per_day
design = model.matrix(~0+diagnosis+sex)
colnames(design) <- gsub("diagnosis", "", colnames(design))
contr.matrix <- makeContrasts(
AdenocarcinomavsSquamous_cell_carcinoma = Adenocarcinoma-Squamous_cell_carcinoma,
levels = colnames(design))
contr.matrix
Contrasts
Levels AdenocarcinomavsSquamous_cell_carcinoma
Adenocarcinoma 1
Squamous_cell_carcinoma -1
sexmale 0
par(mfrow=c(1,2))
v <- voom(geneExpr, design, plot=TRUE)
v
An object of class "EList"
$genes
21165 more rows ...
$targets
35 more rows ...
$E
TCGA-22-4613 TCGA-22-5489 TCGA-22-5491 TCGA-33-AAS8 TCGA-34-5231 TCGA-43-7656 TCGA-43-8118
ENSG00000000003.13 6.578393 4.518445 7.050818 5.481583 6.016010 5.138312 5.436772
ENSG00000000419.11 5.006541 5.125550 6.009261 5.535389 5.227853 6.273798 5.677880
ENSG00000000457.12 3.899691 4.351443 4.164341 3.347290 4.310104 4.022563 3.425331
ENSG00000000460.15 3.720341 4.789466 4.070052 3.040155 4.109311 3.920045 3.333184
ENSG00000000938.11 4.311517 4.032854 2.267258 2.617683 2.708592 3.026093 5.302423
TCGA-44-5645 TCGA-44-A47G TCGA-46-3765 TCGA-46-3766 TCGA-49-4486 TCGA-49-4487 TCGA-49-AARO
ENSG00000000003.13 5.635659 5.702467 6.113541 6.379925 6.279390 5.972862 6.359283
ENSG00000000419.11 4.093028 4.281239 6.092018 5.071231 5.475511 5.916025 4.660764
ENSG00000000457.12 5.427188 3.977077 3.866716 3.928589 4.975262 4.130680 3.932304
ENSG00000000460.15 4.295921 2.528768 3.742487 2.508825 2.713849 4.464368 2.723976
ENSG00000000938.11 2.653449 5.680948 3.611814 4.778872 2.166361 5.063277 5.362237
TCGA-50-5942 TCGA-50-5946 TCGA-50-8457 TCGA-50-8460 TCGA-55-7726 TCGA-55-8089 TCGA-56-5897
ENSG00000000003.13 5.745927 5.480757 5.061664 5.801764 5.618804 5.648883 4.869300
ENSG00000000419.11 4.466833 5.072669 4.197116 4.935787 6.016039 5.237966 5.088500
ENSG00000000457.12 4.777173 4.218002 4.293238 3.792598 3.814772 4.220320 3.935572
ENSG00000000460.15 1.971207 3.939701 2.450756 1.987285 3.384872 3.679206 3.653911
ENSG00000000938.11 2.672767 1.804396 4.286484 4.685892 3.749970 5.450806 4.716595
TCGA-56-A4ZJ TCGA-56-A5DR TCGA-63-A5MH TCGA-63-A5MY TCGA-64-1676 TCGA-68-8250 TCGA-73-4662
ENSG00000000003.13 5.409876 5.176682 5.836250 5.890766 7.721230 6.362591 6.141768
ENSG00000000419.11 5.088331 5.362201 5.024777 6.699551 6.669445 6.051110 4.699274
ENSG00000000457.12 3.573813 4.258951 4.356796 4.139408 4.442916 3.510752 5.172006
ENSG00000000460.15 2.581473 4.020476 4.482014 4.332709 3.469883 3.347400 4.412238
ENSG00000000938.11 5.280737 2.910954 2.161463 2.534827 4.735432 4.758486 4.545843
TCGA-77-A5G7 TCGA-85-8048 TCGA-86-7953 TCGA-86-8076 TCGA-90-7766 TCGA-91-6828 TCGA-93-A4JO
ENSG00000000003.13 5.962677 6.170662 5.978361 5.688321 6.386395 5.854894 6.055768
ENSG00000000419.11 6.046526 5.186354 4.206129 4.600721 5.411248 4.413630 5.220789
ENSG00000000457.12 4.208445 4.329645 3.843559 4.470898 4.452007 4.641835 4.296636
ENSG00000000460.15 4.220873 4.220775 4.150741 2.488662 3.854044 2.833918 3.301439
ENSG00000000938.11 2.619316 5.029770 4.653009 4.916381 3.879929 4.778828 5.055971
TCGA-96-7545 TCGA-98-A53C TCGA-99-AA5R TCGA-NJ-A4YQ TCGA-O1-A52J
ENSG00000000003.13 6.086418 4.356866 4.898964 5.288840 5.802251
ENSG00000000419.11 5.534683 4.524946 4.397344 4.731263 5.020307
ENSG00000000457.12 3.867595 3.721552 4.155948 4.791851 4.284247
ENSG00000000460.15 2.842824 2.332691 2.017264 3.217170 3.054791
ENSG00000000938.11 4.323060 5.647853 6.152786 4.741985 5.868203
21165 more rows ...
$weights
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
[1,] 1.9156382 2.265171 2.264350 2.213717 2.251193 2.259541 2.213335 2.134201 2.196382 2.178912 2.103449
[2,] 1.7274124 2.263898 2.259424 2.130467 2.259430 2.249206 2.129639 1.709450 1.843064 2.068303 1.956363
[3,] 1.1247909 1.889080 1.831902 1.519393 2.058846 1.750188 1.518138 1.492255 1.626831 1.428855 1.303977
[4,] 0.9981961 1.764861 1.705691 1.323075 1.957294 1.621630 1.321983 1.007585 1.085743 1.245279 1.141339
[5,] 1.2599568 1.638856 1.578728 1.704841 1.839755 1.497199 1.703511 1.700961 1.835126 1.608547 1.469780
[,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22]
[1,] 2.260762 1.9845015 2.199452 2.147720 2.261012 2.239719 2.192143 1.912564 2.265181 2.254155 2.229398
[2,] 2.159220 1.4994892 1.850504 1.736032 2.236428 1.982529 1.936786 1.422262 2.210388 2.240132 2.157465
[3,] 1.918998 1.3029032 1.634603 1.518237 2.096214 1.780240 1.608988 1.237064 2.026705 1.700806 1.572091
[4,] 1.377043 0.9075725 1.090570 1.022096 1.590228 1.189021 1.130158 0.874648 1.494592 1.571770 1.369460
[5,] 1.696029 1.4916402 1.842526 1.728006 1.913948 1.975738 1.388865 1.414733 1.821489 1.450037 1.757968
[,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33]
[1,] 2.217851 2.221969 2.247060 2.0726890 2.238861 2.258105 2.162523 2.207645 2.263954 2.264046 2.260432
[2,] 2.188616 2.194131 2.229313 1.7227708 2.217553 2.217851 2.116843 2.174970 2.125441 2.182022 2.229632
[3,] 1.526228 1.539838 1.649835 1.3955026 1.609251 2.124609 1.389669 1.493012 1.965746 1.964307 1.768653
[4,] 1.405102 1.417681 1.522205 0.9983615 1.483421 1.549154 1.279861 1.374249 1.347076 1.422712 1.554026
[5,] 1.295786 1.307260 1.403482 1.2056881 1.367369 2.215406 1.183066 1.267782 2.120177 1.746381 1.945727
[,34] [,35] [,36] [,37] [,38] [,39] [,40]
[1,] 2.0272364 2.252693 2.259175 2.235016 2.216680 2.171008 2.221055
[2,] 1.6608334 2.118335 2.248536 2.170787 1.901291 1.781816 1.914839
[3,] 1.3415540 1.847571 1.745479 1.598156 1.688464 1.563408 1.703532
[4,] 0.9676036 1.311413 1.616765 1.392875 1.125273 1.047906 1.134973
[5,] 1.1615881 1.619536 1.492648 1.784314 1.893762 1.773524 1.907243
21165 more rows ...
$design
Adenocarcinoma Squamous_cell_carcinoma sexmale
1 0 1 0
2 0 1 1
3 0 1 1
4 0 1 0
5 0 1 1
35 more rows ...
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
summary(decideTests(efit))
AdenocarcinomavsSquamous_cell_carcinoma
Down 2482
NotSig 16700
Up 1988
res = topTable(efit,sort.by = "P",n=Inf)
head(res)
tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
summary(dt)
AdenocarcinomavsSquamous_cell_carcinoma
Down 320
NotSig 20769
Up 81
plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1],
xlim=c(-8,13))
glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1],
side.main="gene_name", counts=lcpm, groups=geneExpr$samples$group, launch=FALSE)
library(Homo.sapiens)
geneid = v$genes$gene_name
genes = select(Homo.sapiens, keys = geneid, columns = "ENTREZID", keytype = "SYMBOL")
genes <- genes[!duplicated(genes$SYMBOL),]
genes_v = v$genes
genes_v$ensembl = rownames(genes_v)
genes_v_new = merge(genes_v,genes,by.x="gene_name",by.y="SYMBOL",sort=F)
rownames(genes_v_new) = genes_v_new$ensembl
genes_v_new = genes_v_new[,-3]
v$genes = genes_v_new
load("human_c2_v5p2.rdata")
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam.AdenovsSqua <- camera(v,idx,design,contrast = contr.matrix[,1])
head(cam.AdenovsSqua,20)
NA
barcodeplot(efit$t[,1], index=idx$AMIT_EGF_RESPONSE_20_MCF10A,
index2=idx$DORN_ADENOVIRUS_INFECTION_24HR_DN, main="Adenocarcinoma Vs Squamous Cell Carcinoma")